import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
chicago_df=pd.read_csv('../src/data/Building_Permits.csv')
C:\Users\44742\AppData\Local\Temp\ipykernel_245908\1767199503.py:1: DtypeWarning: Columns (1,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98) have mixed types. Specify dtype option on import or set low_memory=False.
chicago_df=pd.read_csv('../src/data/Building_Permits.csv')
chicago_df.shape
(730511, 119)
chicago_df.iloc[:,50:70].sample(3)
| CONTACT_6_TYPE | CONTACT_6_NAME | CONTACT_6_CITY | CONTACT_6_STATE | CONTACT_6_ZIPCODE | CONTACT_7_TYPE | CONTACT_7_NAME | CONTACT_7_CITY | CONTACT_7_STATE | CONTACT_7_ZIPCODE | CONTACT_8_TYPE | CONTACT_8_NAME | CONTACT_8_CITY | CONTACT_8_STATE | CONTACT_8_ZIPCODE | CONTACT_9_TYPE | CONTACT_9_NAME | CONTACT_9_CITY | CONTACT_9_STATE | CONTACT_9_ZIPCODE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 665460 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 424187 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 17018 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Let's drop columns where the number of nulls is 15% or more
#Calculate the percentage of null rows for each column
col_nulls=chicago_df.isna().sum()*100/chicago_df.shape[0]
#Valid columns (where the number of nulls is less than 15%)
col_valid=col_nulls[col_nulls<15].sort_values(ascending=False)
#Non valid columns (where the number of nulls is 15% or more)
col_non_valid=col_nulls[col_nulls>=15].sort_values(ascending=False)
col_non_valid.index
Index(['CONTACT_15_TYPE', 'CONTACT_15_ZIPCODE', 'CONTACT_15_STATE',
'CONTACT_15_CITY', 'CONTACT_15_NAME', 'CONTACT_14_STATE',
'CONTACT_14_TYPE', 'CONTACT_14_NAME', 'CONTACT_14_CITY',
'CONTACT_14_ZIPCODE', 'CONTACT_13_TYPE', 'CONTACT_13_NAME',
'CONTACT_13_CITY', 'CONTACT_13_STATE', 'CONTACT_13_ZIPCODE',
'CONTACT_12_ZIPCODE', 'CONTACT_12_STATE', 'CONTACT_12_CITY',
'CONTACT_12_TYPE', 'CONTACT_12_NAME', 'CONTACT_11_ZIPCODE',
'CONTACT_11_STATE', 'CONTACT_11_CITY', 'CONTACT_11_NAME',
'CONTACT_11_TYPE', 'PIN10', 'PIN9', 'PIN8', 'CONTACT_10_ZIPCODE',
'CONTACT_10_STATE', 'PIN7', 'CONTACT_10_CITY', 'CONTACT_10_TYPE',
'CONTACT_10_NAME', 'PIN6', 'PIN5', 'PIN4', 'CONTACT_9_ZIPCODE',
'CONTACT_9_STATE', 'CONTACT_9_CITY', 'CONTACT_9_TYPE', 'CONTACT_9_NAME',
'PIN3', 'CONTACT_8_ZIPCODE', 'CONTACT_8_STATE', 'CONTACT_8_CITY',
'CONTACT_8_NAME', 'CONTACT_8_TYPE', 'PIN2', 'CONTACT_7_ZIPCODE',
'CONTACT_7_STATE', 'CONTACT_7_CITY', 'CONTACT_7_NAME', 'CONTACT_7_TYPE',
'CONTACT_6_ZIPCODE', 'CONTACT_6_STATE', 'CONTACT_6_CITY',
'CONTACT_6_NAME', 'CONTACT_6_TYPE', 'CONTACT_5_ZIPCODE',
'CONTACT_5_STATE', 'CONTACT_5_CITY', 'CONTACT_5_TYPE', 'CONTACT_5_NAME',
'CONTACT_4_ZIPCODE', 'CONTACT_4_STATE', 'CONTACT_4_CITY',
'CONTACT_4_NAME', 'CONTACT_4_TYPE', 'CONTACT_3_ZIPCODE',
'CONTACT_3_STATE', 'CONTACT_3_CITY', 'CONTACT_3_NAME', 'CONTACT_3_TYPE',
'CONTACT_2_ZIPCODE', 'CONTACT_2_STATE', 'CONTACT_2_CITY',
'CONTACT_2_NAME', 'CONTACT_2_TYPE'],
dtype='object')
#Drop the non-valid columns
chicago_clean=chicago_df.drop(columns=col_non_valid.index)
chicago_clean;
Let's drop columns that do not hold valuable information for the analysis
chicago_wip=chicago_clean.drop(columns=['ID','PERMIT#','PIN1','XCOORDINATE','YCOORDINATE'])
chicago_wip.shape
(730511, 35)
Let's drop columns that might hold useful information but will be ommited from the initial EDA
chicago_wip2=chicago_wip.drop(columns=['CONTACT_1_STATE','CONTACT_1_ZIPCODE','SUFFIX','CONTACT_1_ZIPCODE','CONTACT_1_NAME','LOCATION'])
print(chicago_wip2.shape)
chicago_wip2;
(730511, 30)
chicago_wip3=chicago_wip2.drop(columns=['STREET_NAME','CONTACT_1_CITY','STREET_NAME','OTHER_FEE_PAID','STREET DIRECTION','ZONING_FEE_PAID','SUBTOTAL_WAIVED','ZONING_FEE_UNPAID','OTHER_FEE_WAIVED',\
'BUILDING_FEE_UNPAID','OTHER_FEE_UNPAID','BUILDING_FEE_WAIVED'])
chicago_wip3
| PERMIT_TYPE | REVIEW_TYPE | APPLICATION_START_DATE | ISSUE_DATE | PROCESSING_TIME | STREET_NUMBER | WORK_DESCRIPTION | BUILDING_FEE_PAID | SUBTOTAL_PAID | SUBTOTAL_UNPAID | ZONING_FEE_WAIVED | TOTAL_FEE | CONTACT_1_TYPE | REPORTED_COST | COMMUNITY_AREA | CENSUS_TRACT | WARD | LATITUDE | LONGITUDE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | PERMIT - RENOVATION/ALTERATION | STANDARD PLAN REVIEW | 10/14/2005 | 01/03/2006 | 81.0 | 2728 | INTERIOR REMODELING OF EXISTING 3 D.U. PER PLA... | 125.0 | 200.0 | 0.0 | 0.0 | 200.0 | OWNER AS GENERAL CONTRACTOR | 3000.0 | NaN | NaN | NaN | 41.930957 | -87.718543 |
| 1 | PERMIT - NEW CONSTRUCTION | STANDARD PLAN REVIEW | 12/05/2005 | 03/29/2006 | 114.0 | 1409 | new construction, erect 3 story slab on grade ... | 1029.6 | 1242.1 | 0.0 | 0.0 | 1242.1 | ARCHITECT | 190000.0 | NaN | NaN | NaN | 41.862155 | -87.728575 |
| 2 | PERMIT - SIGNS | SIGN PERMIT | 10/17/2005 | 01/12/2006 | 87.0 | 120 | north elevation 386431 | 56.0 | 56.0 | 0.0 | 0.0 | 56.0 | CONTRACTOR-ELECTRICAL | 0.0 | NaN | NaN | NaN | 41.880356 | -87.632466 |
| 3 | PERMIT - RENOVATION/ALTERATION | SELF CERT | 01/25/2006 | 01/26/2006 | 1.0 | 71 | Interior alteration of existing partial 34th f... | 2408.0 | 2483.0 | 0.0 | 0.0 | 2483.0 | SELF CERT ARCHITECT | 125000.0 | NaN | NaN | NaN | 41.880718 | -87.636589 |
| 4 | PERMIT - RENOVATION/ALTERATION | SELF CERT | 01/21/2006 | 01/21/2006 | 0.0 | 4901 | Interior alterations to the retail space on th... | 2443.0 | 2518.0 | 0.0 | 0.0 | 2518.0 | SELF CERT ARCHITECT | 41527.0 | NaN | NaN | NaN | 41.803882 | -87.742911 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 730506 | PERMIT - EASY PERMIT PROCESS | EASY PERMIT | 02/15/2023 | 02/24/2023 | 9.0 | 6926 | SOLAR PANEL PERMIT. ERECT A PHOTOVOLTAIC ARRA... | 0.0 | 0.0 | 275.0 | 0.0 | 275.0 | STRUCTURAL ENGINEER | 18031.0 | 65.0 | 650500.0 | 13.0 | 41.766834 | -87.725048 |
| 730507 | PERMIT - EASY PERMIT PROCESS | EASY PERMIT WEB | 02/24/2023 | 02/24/2023 | 0.0 | 2529 | FENCES: 165FT 0IN X 6FT 0IN: QTY 1 (not to be ... | 150.0 | 150.0 | 0.0 | 0.0 | 150.0 | OWNER AS GENERAL CONTRACTOR | 5900.0 | 22.0 | 220400.0 | 32.0 | 41.927464 | -87.701016 |
| 730508 | PERMIT - ELECTRIC WIRING | EASY PERMIT WEB | 02/24/2023 | 02/24/2023 | 0.0 | 1125 | TECH WILL PULL 100 CAT-6 CABLES, TERMINATE, AN... | 75.0 | 75.0 | 0.0 | 0.0 | 75.0 | CONTRACTOR-ELECTRICAL | 9000.0 | 8.0 | 81202.0 | 2.0 | 41.902604 | -87.628254 |
| 730509 | PERMIT - EASY PERMIT PROCESS | EASY PERMIT | 02/24/2023 | 02/24/2023 | 0.0 | 3328 | REVISION TO PERMIT NO 100988130 TO CHANGE ELEC... | 75.0 | 75.0 | 0.0 | 0.0 | 75.0 | CONTRACTOR-ELECTRICAL | 1.0 | 27.0 | 837300.0 | 24.0 | 41.871944 | -87.709470 |
| 730510 | PERMIT - ELECTRIC WIRING | EASY PERMIT WEB | 02/24/2023 | 02/24/2023 | 0.0 | 12423 | REPAIR SERVICE | 75.0 | 75.0 | 0.0 | 0.0 | 75.0 | CONTRACTOR-ELECTRICAL | 500.0 | 53.0 | 530503.0 | 34.0 | 41.668048 | -87.637825 |
730511 rows × 19 columns
Let's check the null values for the remaining columns
#
chicago_wip3.isnull().sum()*100/chicago_wip3.shape[0]
PERMIT_TYPE 0.000000 REVIEW_TYPE 0.000958 APPLICATION_START_DATE 0.071457 ISSUE_DATE 0.000000 PROCESSING_TIME 0.071457 STREET_NUMBER 0.000000 WORK_DESCRIPTION 0.007255 BUILDING_FEE_PAID 0.000000 SUBTOTAL_PAID 0.000000 SUBTOTAL_UNPAID 0.000000 ZONING_FEE_WAIVED 0.000000 TOTAL_FEE 0.000000 CONTACT_1_TYPE 0.653378 REPORTED_COST 0.051060 COMMUNITY_AREA 14.351735 CENSUS_TRACT 14.927770 WARD 14.380482 LATITUDE 0.308551 LONGITUDE 0.308551 dtype: float64
chicago_wip3.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 730511 entries, 0 to 730510 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PERMIT_TYPE 730511 non-null object 1 REVIEW_TYPE 730504 non-null object 2 APPLICATION_START_DATE 729989 non-null object 3 ISSUE_DATE 730511 non-null object 4 PROCESSING_TIME 729989 non-null float64 5 STREET_NUMBER 730511 non-null int64 6 WORK_DESCRIPTION 730458 non-null object 7 BUILDING_FEE_PAID 730511 non-null float64 8 SUBTOTAL_PAID 730511 non-null float64 9 SUBTOTAL_UNPAID 730511 non-null float64 10 ZONING_FEE_WAIVED 730511 non-null float64 11 TOTAL_FEE 730511 non-null float64 12 CONTACT_1_TYPE 725738 non-null object 13 REPORTED_COST 730138 non-null float64 14 COMMUNITY_AREA 625670 non-null float64 15 CENSUS_TRACT 621462 non-null float64 16 WARD 625460 non-null float64 17 LATITUDE 728257 non-null float64 18 LONGITUDE 728257 non-null float64 dtypes: float64(12), int64(1), object(6) memory usage: 105.9+ MB
chicago_wip3.describe()
| PROCESSING_TIME | STREET_NUMBER | BUILDING_FEE_PAID | SUBTOTAL_PAID | SUBTOTAL_UNPAID | ZONING_FEE_WAIVED | TOTAL_FEE | REPORTED_COST | COMMUNITY_AREA | CENSUS_TRACT | WARD | LATITUDE | LONGITUDE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 729989.000000 | 730511.000000 | 730511.000000 | 730511.000000 | 7.305110e+05 | 730511.000000 | 7.305110e+05 | 7.301380e+05 | 625670.000000 | 621462.000000 | 625460.000000 | 728257.000000 | 728257.000000 |
| mean | 22.629837 | 3459.539303 | 675.572852 | 808.444783 | 1.706921e+01 | 1.817716 | 9.478778e+02 | 2.179828e+05 | 31.532530 | 310674.564295 | 27.169176 | 41.870021 | -87.672919 |
| std | 105.683331 | 2978.569953 | 5775.639286 | 6944.309270 | 2.928526e+03 | 135.210814 | 1.157143e+04 | 1.836746e+07 | 21.942272 | 317214.267549 | 15.097453 | 0.083764 | 0.059634 |
| min | -2876.000000 | 1.000000 | 0.000000 | 0.000000 | -1.440000e+04 | 0.000000 | -1.152747e+04 | -1.000000e+03 | 0.000000 | 0.000000 | 1.000000 | 41.644670 | -87.940225 |
| 25% | 0.000000 | 946.000000 | 58.000000 | 75.000000 | 0.000000e+00 | 0.000000 | 7.500000e+01 | 5.000000e+02 | 12.000000 | 7207.000000 | 14.000000 | 41.803620 | -87.707301 |
| 50% | 0.000000 | 2714.000000 | 150.000000 | 200.000000 | 0.000000e+00 | 0.000000 | 2.250000e+02 | 4.500000e+03 | 28.000000 | 210400.000000 | 29.000000 | 41.885830 | -87.662388 |
| 75% | 8.000000 | 5346.000000 | 400.000000 | 450.000000 | 0.000000e+00 | 0.000000 | 5.000000e+02 | 2.000000e+04 | 46.000000 | 630400.000000 | 42.000000 | 41.930764 | -87.631931 |
| max | 5699.000000 | 111601.000000 | 882191.200000 | 999683.920000 | 1.944425e+06 | 102250.000000 | 5.772092e+06 | 1.000000e+10 | 77.000000 | 980100.000000 | 50.000000 | 42.022780 | -87.524677 |
Change data types
chicago_wip3['ISSUE_DATE']=pd.to_datetime(chicago_wip3['ISSUE_DATE'])
chicago_wip3['APPLICATION_START_DATE']=pd.to_datetime(chicago_wip3['APPLICATION_START_DATE'])
chicago_wip3.nunique()
PERMIT_TYPE 11 REVIEW_TYPE 11 APPLICATION_START_DATE 6832 ISSUE_DATE 6171 PROCESSING_TIME 1697 STREET_NUMBER 10690 WORK_DESCRIPTION 563792 BUILDING_FEE_PAID 49426 SUBTOTAL_PAID 53591 SUBTOTAL_UNPAID 1307 ZONING_FEE_WAIVED 137 TOTAL_FEE 56498 CONTACT_1_TYPE 26 REPORTED_COST 45221 COMMUNITY_AREA 78 CENSUS_TRACT 1671 WARD 50 LATITUDE 246124 LONGITUDE 246079 dtype: int64
Let's check if we have consistent numbers of application for all years & wards
chicago_wip3.groupby(chicago_wip3['ISSUE_DATE'].dt.year).count()['PERMIT_TYPE'].plot(figsize=(10,5),grid=True)
<AxesSubplot:xlabel='ISSUE_DATE'>
chicago_wip3.groupby([chicago_wip3['ISSUE_DATE'].dt.year,chicago_wip3['ISSUE_DATE'].dt.month]).count()['PERMIT_TYPE'].plot(figsize=(20,10),grid=True)
<AxesSubplot:xlabel='ISSUE_DATE,ISSUE_DATE'>
yearly_wards=chicago_wip3.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD']).count()['PERMIT_TYPE']
yearly_wards.unstack().T.isna().sum()
ISSUE_DATE 2006 47 2007 25 2008 0 2009 0 2010 0 2011 0 2012 0 2013 0 2014 0 2015 0 2016 0 2017 0 2018 0 2019 0 2020 0 2021 0 2022 0 2023 0 dtype: int64
Let's only consider data from 2008 onwards. The data before 2008 appears incomplete
[chicago_wip3['ISSUE_DATE'].dt.year>=2008]
[0 False
1 False
2 False
3 False
4 False
...
730506 True
730507 True
730508 True
730509 True
730510 True
Name: ISSUE_DATE, Length: 730511, dtype: bool]
chicago_wip4=chicago_wip3[chicago_wip3['ISSUE_DATE'].dt.year>=2008]
chicago_wip4.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 635827 entries, 20 to 730510 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PERMIT_TYPE 635827 non-null object 1 REVIEW_TYPE 635820 non-null object 2 APPLICATION_START_DATE 635664 non-null datetime64[ns] 3 ISSUE_DATE 635827 non-null datetime64[ns] 4 PROCESSING_TIME 635664 non-null float64 5 STREET_NUMBER 635827 non-null int64 6 WORK_DESCRIPTION 635807 non-null object 7 BUILDING_FEE_PAID 635827 non-null float64 8 SUBTOTAL_PAID 635827 non-null float64 9 SUBTOTAL_UNPAID 635827 non-null float64 10 ZONING_FEE_WAIVED 635827 non-null float64 11 TOTAL_FEE 635827 non-null float64 12 CONTACT_1_TYPE 632737 non-null object 13 REPORTED_COST 635816 non-null float64 14 COMMUNITY_AREA 625605 non-null float64 15 CENSUS_TRACT 621397 non-null float64 16 WARD 625395 non-null float64 17 LATITUDE 633882 non-null float64 18 LONGITUDE 633882 non-null float64 dtypes: datetime64[ns](2), float64(12), int64(1), object(4) memory usage: 97.0+ MB
chicago_wip4.nunique()
PERMIT_TYPE 11 REVIEW_TYPE 11 APPLICATION_START_DATE 6030 ISSUE_DATE 5502 PROCESSING_TIME 1408 STREET_NUMBER 10546 WORK_DESCRIPTION 492558 BUILDING_FEE_PAID 42706 SUBTOTAL_PAID 46792 SUBTOTAL_UNPAID 1179 ZONING_FEE_WAIVED 123 TOTAL_FEE 49062 CONTACT_1_TYPE 26 REPORTED_COST 42204 COMMUNITY_AREA 78 CENSUS_TRACT 1671 WARD 50 LATITUDE 219246 LONGITUDE 219218 dtype: int64
chicago_wip4['CENSUS_TRACT']
20 NaN
32 611900.0
33 830600.0
34 170600.0
46 100500.0
...
730506 650500.0
730507 220400.0
730508 81202.0
730509 837300.0
730510 530503.0
Name: CENSUS_TRACT, Length: 635827, dtype: float64
yearly_wards=chicago_wip4.groupby([(chicago_wip4['ISSUE_DATE']).dt.year,'WARD']).count()['PERMIT_TYPE']
yearly_wards.unstack().plot(figsize=(30,15),grid=True,title='Total Number of Applications by Ward')
<AxesSubplot:title={'center':'Total Number of Applications by Ward'}, xlabel='ISSUE_DATE'>
reported_cost_wards_total=chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD'])['REPORTED_COST'].sum()
reported_cost_wards=chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD'])['REPORTED_COST'].sum()/chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'WARD'])['REPORTED_COST'].count()
reported_cost_wards.unstack().plot(figsize=(30,15), grid=True, title='AVG reported cost per application by ward')
<AxesSubplot:title={'center':'AVG reported cost per application by ward'}, xlabel='ISSUE_DATE'>
reported_cost_community=chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'COMMUNITY_AREA'])['REPORTED_COST'].sum()/chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,'COMMUNITY_AREA'])['REPORTED_COST'].count()
reported_cost_community.unstack().plot(figsize=(30,15), grid=True, title='AVG reported cost per application by community area');
#What types of Permit Types are there?
chicago_wip4['PERMIT_TYPE'].unique()
array(['PERMIT - SIGNS', 'PERMIT - WRECKING/DEMOLITION',
'PERMIT - EASY PERMIT PROCESS', 'PERMIT - RENOVATION/ALTERATION',
'PERMIT - ELECTRIC WIRING', 'PERMIT - NEW CONSTRUCTION',
'PERMIT - ELEVATOR EQUIPMENT', 'PERMIT - REINSTATE REVOKED PMT',
'PERMIT - SCAFFOLDING', 'PERMIT - FOR EXTENSION OF PMT',
'PERMIT - PORCH CONSTRUCTION'], dtype=object)
chicago_wip4.groupby('PERMIT_TYPE').count()['ISSUE_DATE'].plot(kind='bar')
<AxesSubplot:xlabel='PERMIT_TYPE'>
#Can text processing be applied for this column to find any patterns/ form clusters?
chicago_wip4['WORK_DESCRIPTION'].sample(20)
76774 HOT WATER HEATER (REPLACEMENT ONLY): QTY 1 454393 REPLACE WIRING, OUTLETS AND LIGHT FIXTURES 427136 REPLACEMENT OF FRONT WOOD PORCH IN SAME LOCATI... 86309 REPLACE 5 EXISTING GLASS BLOCK. REPAIR CONCRET... 183000 INSTALLATION OF ELECTRICAL TIMER SWITCH 81463 INSTALL LOW VOLTAGE BURGLAR ALARM. 325615 SELF CERT: INTERIOR ALTERATIONS TO 32ND FLOOR ... 76703 GARAGE 484130 GARAGE 533248 ADDRESS CODE VIOLATIONS ALL NEW DRYWALL EW FIX... 637941 INSTALL LOW VOLTAGE SOUND SYSTEM 170057 INSTALLATION OF LOW VOLTAGE BURGLAR ALARM # 34... 270098 SEPTEMBER MAINTENANCE/COMPUTER/PHONE/WIRELESS ... 287470 ALTERATIONS TO FIRST FLOOR TENANT LOBBY AND 5T... 60886 INTERIOR RENOVATION AND REAR 2 STORY ADDITION 174204 CONCRETE PATCHING & REPAIRS AT GRADE LEVEL BUI... 611780 All Elevation, repair/replace sealant 6000 Lin... 207138 GENERAL MONTHLY MAINTENANCE, NOVEMBER, 2011; J... 682854 INTERIOR ALTERATIONS & REPLACE ENCLOSED FRAME ... 425371 GARAGE Name: WORK_DESCRIPTION, dtype: object
Option 1: Great (but would require linking it to US Census Database)
[Defintion] Census Tracts are small, relatively permanent statistical subdivisions of a county or statistically equivalent entity that can be updated by local participants prior to each decennial census as part of the Census Bureau's Participant Statistical Areas Program (PSAP) https://www.census.gov/programs-surveys/geography/about/glossary.html#:~:text=Census%20Tracts%20are%20small%2C%20relatively,Statistical%20Areas%20Program%20(PSAP).
Option 2: Divide by WARD or COMMUNITY_AREA
chicago_wip4['WARD'].unique()
array([nan, 20., 40., 38., 41., 24., 23., 29., 11., 46., 8., 15., 42.,
43., 2., 45., 27., 25., 39., 48., 1., 44., 7., 6., 14., 37.,
32., 4., 35., 21., 10., 50., 33., 22., 47., 26., 9., 5., 34.,
19., 12., 17., 13., 18., 3., 31., 28., 49., 36., 30., 16.])
chicago_wip4['COMMUNITY_AREA'].unique()
array([nan, 61., 1., 17., 10., 29., 76., 65., 15., 25., 3., 43., 66.,
8., 28., 33., 34., 13., 16., 77., 24., 32., 57., 6., 69., 58.,
22., 21., 71., 46., 2., 14., 51., 11., 5., 20., 53., 44., 7.,
30., 60., 75., 12., 64., 72., 63., 48., 73., 70., 59., 37., 56.,
4., 39., 35., 49., 42., 23., 27., 19., 45., 50., 74., 68., 0.,
52., 31., 67., 55., 41., 47., 9., 18., 26., 62., 38., 40., 54.,
36.])
Ultimately, the gropgraphic division will depend on the structure of the target variable dataset
Let's load out current target variable dataset.
prices_df_raw=pd.read_csv('../src/data/Chicago_price_index_data.csv')
prices_df_raw.head();
#let's drop unnecessary columns
prices_df_wip=prices_df.drop(columns=['COOK','SUBURBS'])
prices_df_wip.head()
| YEARQ | CHICAGO | Palatine/Barrington | Melrose Park/Maywood | Oak Park/Cicero | LaGrange/Burbank | Orland Park/Lemont | Oak Lawn/Blue Island | Oak Forest/Country Club Hills | Calumet City/Harvey | ... | Chicago--Logan Square/Avondale | Chicago--Humboldt Park/Garfield Park | Chicago--West Town/Near West Side | Chicago--Bridgeport/Brighton Park | Chicago--Gage Park/West Lawn | Chicago--Englewood/Greater Grand Crossing | Chicago--Bronzeville/Hyde Park | Chicago--Beverly/Morgan Park | Chicago--Auburn Gresham/Chatham | Chicago--South Chicago/West Pullman | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1997Q1 | 80.51 | 88.45 | 90.59 | 82.87 | 84.73 | 92.37 | 92.80 | 93.44 | 91.80 | ... | 70.60 | 82.13 | 70.66 | 79.70 | 88.14 | 85.80 | 61.49 | 81.30 | 80.24 | 84.16 |
| 1 | 1997Q2 | 81.74 | 88.13 | 90.05 | 85.86 | 87.83 | 91.98 | 92.88 | 94.72 | 93.23 | ... | 72.17 | 86.80 | 70.21 | 82.34 | 89.83 | 86.00 | 72.59 | 85.60 | 84.36 | 90.44 |
| 2 | 1997Q3 | 83.66 | 88.66 | 90.26 | 86.83 | 89.65 | 93.76 | 93.68 | 95.50 | 93.36 | ... | 73.73 | 92.90 | 69.67 | 86.17 | 90.24 | 86.01 | 76.99 | 87.86 | 86.57 | 90.62 |
| 3 | 1997Q4 | 84.18 | 88.37 | 91.27 | 86.14 | 90.78 | 94.15 | 94.09 | 95.47 | 93.54 | ... | 72.93 | 93.54 | 68.52 | 88.45 | 90.15 | 87.37 | 78.16 | 89.31 | 87.62 | 89.88 |
| 4 | 1998Q1 | 85.49 | 88.83 | 90.71 | 86.89 | 91.34 | 94.44 | 94.40 | 95.97 | 93.71 | ... | 74.22 | 95.94 | 69.15 | 91.34 | 90.62 | 89.48 | 85.44 | 91.14 | 89.50 | 90.64 |
5 rows × 35 columns
Current dataset appears to be divided by COMMUNITY AREAS
year=[year_.split('Q')[0] for year_ in prices_df_wip['YEARQ']]
year=[int(y) for y in year]
prices_df_wip.insert(1,'YEAR',year)
quarter=[year_.split('Q')[1] for year_ in prices_df_wip['YEARQ']]
quarter=[int(q) for q in quarter]
prices_df_wip.insert(2,'QUARTER',quarter)
prices_df_wip.head();
prices_df_wip2=prices_df_wip[prices_df_wip['YEAR']>=2008]
prices_df_wip3=prices_df_wip2.drop(columns='YEARQ')
prices_df_wip3.columns[3:]
Index(['Palatine/Barrington', 'Melrose Park/Maywood', 'Oak Park/Cicero',
'LaGrange/Burbank', 'Orland Park/Lemont', 'Oak Lawn/Blue Island',
'Oak Forest/Country Club Hills', 'Calumet City/Harvey',
'Chicago Heights/Park Forest', 'Arlington Heights/Wheeling',
'Winnetka/Northbrook', 'Hoffman Estates/Streamwood', 'Schaumburg',
'Mount Prospect/Elk Grove Village', 'Park Ridge/Des Plaines',
'Evanston/Skokie', 'Elmwood Park/Franklin Park',
'Chicago--Uptown/Rogers Park', 'Chicago--Lake View/Lincoln Park',
'Chicago--Lincoln Square/North Center',
'Chicago--Irving Park/Albany Park',
'Chicago--Portage Park/Jefferson Park',
'Chicago--Austin/Belmont Cragin', 'Chicago--Logan Square/Avondale',
'Chicago--Humboldt Park/Garfield Park',
'Chicago--West Town/Near West Side',
'Chicago--Bridgeport/Brighton Park', 'Chicago--Gage Park/West Lawn',
'Chicago--Englewood/Greater Grand Crossing',
'Chicago--Bronzeville/Hyde Park', 'Chicago--Beverly/Morgan Park',
'Chicago--Auburn Gresham/Chatham',
'Chicago--South Chicago/West Pullman'],
dtype='object')
#Let's select neighbourhood names NOT starting with Chicago
non_chicago=[name for name in prices_df_wip3.columns[3:] if name[0:7]!='Chicago']
non_chicago
['Palatine/Barrington', 'Melrose Park/Maywood', 'Oak Park/Cicero', 'LaGrange/Burbank', 'Orland Park/Lemont', 'Oak Lawn/Blue Island', 'Oak Forest/Country Club Hills', 'Calumet City/Harvey', 'Arlington Heights/Wheeling', 'Winnetka/Northbrook', 'Hoffman Estates/Streamwood', 'Schaumburg', 'Mount Prospect/Elk Grove Village', 'Park Ridge/Des Plaines', 'Evanston/Skokie', 'Elmwood Park/Franklin Park']
prices_df_wip4=prices_df_wip3.drop(columns=non_chicago)
We need to assign a Community Area number to each column to be able to match it with the main (permit applications) dataset
prices_df_wip4.head()
| YEAR | QUARTER | CHICAGO | Chicago Heights/Park Forest | Chicago--Uptown/Rogers Park | Chicago--Lake View/Lincoln Park | Chicago--Lincoln Square/North Center | Chicago--Irving Park/Albany Park | Chicago--Portage Park/Jefferson Park | Chicago--Austin/Belmont Cragin | Chicago--Logan Square/Avondale | Chicago--Humboldt Park/Garfield Park | Chicago--West Town/Near West Side | Chicago--Bridgeport/Brighton Park | Chicago--Gage Park/West Lawn | Chicago--Englewood/Greater Grand Crossing | Chicago--Bronzeville/Hyde Park | Chicago--Beverly/Morgan Park | Chicago--Auburn Gresham/Chatham | Chicago--South Chicago/West Pullman | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 44 | 2008 | 1 | 199.90 | 155.62 | 186.57 | 169.84 | 202.53 | 189.77 | 186.95 | 215.63 | 218.86 | 278.41 | 227.74 | 234.84 | 201.88 | 242.33 | 239.50 | 186.94 | 187.41 | 187.56 |
| 45 | 2008 | 2 | 194.15 | 148.34 | 185.11 | 173.84 | 199.33 | 185.57 | 179.89 | 208.45 | 219.68 | 262.04 | 225.33 | 229.64 | 194.27 | 231.29 | 233.37 | 179.23 | 183.09 | 182.65 |
| 46 | 2008 | 3 | 185.49 | 140.11 | 173.78 | 169.24 | 192.16 | 178.29 | 170.94 | 195.23 | 219.40 | 232.32 | 225.68 | 222.22 | 184.73 | 213.16 | 216.28 | 172.93 | 176.09 | 171.06 |
| 47 | 2008 | 4 | 177.70 | 133.30 | 177.28 | 165.41 | 188.11 | 173.33 | 164.78 | 177.94 | 213.48 | 198.29 | 223.29 | 217.12 | 175.37 | 202.28 | 198.29 | 165.64 | 169.20 | 164.49 |
| 48 | 2009 | 1 | 168.42 | 123.70 | 172.92 | 165.58 | 183.28 | 168.44 | 161.09 | 163.10 | 207.91 | 170.17 | 215.20 | 202.45 | 166.22 | 175.62 | 182.58 | 157.68 | 158.34 | 157.86 |
prices_df_wip4.reset_index(inplace=True)
prices_df_wip5=prices_df_wip4.drop(columns=['index','Chicago Heights/Park Forest'])
Need to assign the community area number to each column
prices_df_wip5.head()
| YEAR | QUARTER | CHICAGO | Chicago--Uptown/Rogers Park | Chicago--Lake View/Lincoln Park | Chicago--Lincoln Square/North Center | Chicago--Irving Park/Albany Park | Chicago--Portage Park/Jefferson Park | Chicago--Austin/Belmont Cragin | Chicago--Logan Square/Avondale | Chicago--Humboldt Park/Garfield Park | Chicago--West Town/Near West Side | Chicago--Bridgeport/Brighton Park | Chicago--Gage Park/West Lawn | Chicago--Englewood/Greater Grand Crossing | Chicago--Bronzeville/Hyde Park | Chicago--Beverly/Morgan Park | Chicago--Auburn Gresham/Chatham | Chicago--South Chicago/West Pullman | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 199.90 | 186.57 | 169.84 | 202.53 | 189.77 | 186.95 | 215.63 | 218.86 | 278.41 | 227.74 | 234.84 | 201.88 | 242.33 | 239.50 | 186.94 | 187.41 | 187.56 |
| 1 | 2008 | 2 | 194.15 | 185.11 | 173.84 | 199.33 | 185.57 | 179.89 | 208.45 | 219.68 | 262.04 | 225.33 | 229.64 | 194.27 | 231.29 | 233.37 | 179.23 | 183.09 | 182.65 |
| 2 | 2008 | 3 | 185.49 | 173.78 | 169.24 | 192.16 | 178.29 | 170.94 | 195.23 | 219.40 | 232.32 | 225.68 | 222.22 | 184.73 | 213.16 | 216.28 | 172.93 | 176.09 | 171.06 |
| 3 | 2008 | 4 | 177.70 | 177.28 | 165.41 | 188.11 | 173.33 | 164.78 | 177.94 | 213.48 | 198.29 | 223.29 | 217.12 | 175.37 | 202.28 | 198.29 | 165.64 | 169.20 | 164.49 |
| 4 | 2009 | 1 | 168.42 | 172.92 | 165.58 | 183.28 | 168.44 | 161.09 | 163.10 | 207.91 | 170.17 | 215.20 | 202.45 | 166.22 | 175.62 | 182.58 | 157.68 | 158.34 | 157.86 |
#for each column name [1] store the series, [2]
di={col:col.split('--')[1] for col in prices_df_wip5.columns[3:]}
di;
prices_df_wip6=prices_df_wip5.rename(columns=di)
prices_df_wip6.head()
| YEAR | QUARTER | CHICAGO | Uptown/Rogers Park | Lake View/Lincoln Park | Lincoln Square/North Center | Irving Park/Albany Park | Portage Park/Jefferson Park | Austin/Belmont Cragin | Logan Square/Avondale | Humboldt Park/Garfield Park | West Town/Near West Side | Bridgeport/Brighton Park | Gage Park/West Lawn | Englewood/Greater Grand Crossing | Bronzeville/Hyde Park | Beverly/Morgan Park | Auburn Gresham/Chatham | South Chicago/West Pullman | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 199.90 | 186.57 | 169.84 | 202.53 | 189.77 | 186.95 | 215.63 | 218.86 | 278.41 | 227.74 | 234.84 | 201.88 | 242.33 | 239.50 | 186.94 | 187.41 | 187.56 |
| 1 | 2008 | 2 | 194.15 | 185.11 | 173.84 | 199.33 | 185.57 | 179.89 | 208.45 | 219.68 | 262.04 | 225.33 | 229.64 | 194.27 | 231.29 | 233.37 | 179.23 | 183.09 | 182.65 |
| 2 | 2008 | 3 | 185.49 | 173.78 | 169.24 | 192.16 | 178.29 | 170.94 | 195.23 | 219.40 | 232.32 | 225.68 | 222.22 | 184.73 | 213.16 | 216.28 | 172.93 | 176.09 | 171.06 |
| 3 | 2008 | 4 | 177.70 | 177.28 | 165.41 | 188.11 | 173.33 | 164.78 | 177.94 | 213.48 | 198.29 | 223.29 | 217.12 | 175.37 | 202.28 | 198.29 | 165.64 | 169.20 | 164.49 |
| 4 | 2009 | 1 | 168.42 | 172.92 | 165.58 | 183.28 | 168.44 | 161.09 | 163.10 | 207.91 | 170.17 | 215.20 | 202.45 | 166.22 | 175.62 | 182.58 | 157.68 | 158.34 | 157.86 |
prices_df_wip7=prices_df_wip6.copy()
for col in prices_df_wip6.columns[3:]:
temp=prices_df_wip6[col]
prices_df_wip7[col.split('/')[0]]=temp
prices_df_wip7[col.split('/')[1]]=temp
prices_df_wip7=prices_df_wip7.drop(columns=col)
prices_df_wip7.head(1)
| YEAR | QUARTER | CHICAGO | Uptown | Rogers Park | Lake View | Lincoln Park | Lincoln Square | North Center | Irving Park | ... | Englewood | Greater Grand Crossing | Bronzeville | Hyde Park | Beverly | Morgan Park | Auburn Gresham | Chatham | South Chicago | West Pullman | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 199.9 | 186.57 | 186.57 | 169.84 | 169.84 | 202.53 | 202.53 | 189.77 | ... | 242.33 | 242.33 | 239.5 | 239.5 | 186.94 | 186.94 | 187.41 | 187.41 | 187.56 | 187.56 |
1 rows × 35 columns
prices_df_wip7.columns
Index(['YEAR', 'QUARTER', 'CHICAGO', 'Uptown', 'Rogers Park', 'Lake View',
'Lincoln Park', 'Lincoln Square', 'North Center', 'Irving Park',
'Albany Park', 'Portage Park', 'Jefferson Park', 'Austin',
'Belmont Cragin', 'Logan Square', 'Avondale', 'Humboldt Park',
'Garfield Park', 'West Town', 'Near West Side', 'Bridgeport',
'Brighton Park', 'Gage Park', 'West Lawn', 'Englewood',
'Greater Grand Crossing', 'Bronzeville', 'Hyde Park', 'Beverly',
'Morgan Park', 'Auburn Gresham', 'Chatham', 'South Chicago',
'West Pullman'],
dtype='object')
map_dict ={'Uptown':3, 'Rogers Park':1,
'Lake View':6, 'Lincoln Park':7, 'Lincoln Square':4, 'North Center':5,
'Irving Park':16, 'Albany Park':14, 'Portage Park':15, 'Jefferson Park':11,
'Austin':25, 'Belmont Cragin':19, 'Logan Square':22, 'Avondale':21, 'Humboldt Park':23,
'Garfield Park':26, 'West Town':24, 'Near West Side':28, 'Bronzeville': 38, 'Bridgeport':60,
'Brighton Park':58, 'Gage Park':63, 'West Lawn':65, 'Englewood':68,
'Greater Grand Crossing':69, 'Hyde Park':41, 'Beverly':72,
'Morgan Park':75, 'Auburn Gresham':71, 'Chatham':44, 'South Chicago':46,
'West Pullman':53}
prices_df_wip8=prices_df_wip7.rename(columns=map_dict)
prices_df_wip8.columns
Index([ 'YEAR', 'QUARTER', 'CHICAGO', 3, 1, 6,
7, 4, 5, 16, 14, 15,
11, 25, 19, 22, 21, 23,
26, 24, 28, 60, 58, 63,
65, 68, 69, 38, 41, 72,
75, 71, 44, 46, 53],
dtype='object')
#prices_df_wip9=prices_df_wip8.drop(columns=['Heights','Park Forest','Bronzeville'])
prices_df_wip8.groupby(['YEAR','QUARTER']).mean().plot(figsize=(30,15),grid=True)
<AxesSubplot:xlabel='YEAR,QUARTER'>
areas=list(prices_df_wip8.columns)[3:]
reported_cost_community2=chicago_wip4.groupby([chicago_wip4['ISSUE_DATE'].dt.year,chicago_wip4['ISSUE_DATE'].dt.quarter,'COMMUNITY_AREA'])['REPORTED_COST'].sum()/chicago_wip4.groupby([chicago_wip3['ISSUE_DATE'].dt.year,chicago_wip4['ISSUE_DATE'].dt.quarter,'COMMUNITY_AREA'])['REPORTED_COST'].count()
reported_cost_community2.unstack()[areas].plot(figsize=(30,15), grid=True, title='AVG reported cost per application by community area');
renov_df=chicago_wip4[chicago_wip4['PERMIT_TYPE']=='PERMIT - RENOVATION/ALTERATION']
reported_cost_renov=renov_df.groupby([renov_df['ISSUE_DATE'].dt.year,renov_df['ISSUE_DATE'].dt.quarter,'COMMUNITY_AREA'])['REPORTED_COST'].sum()
reported_cost_renov.unstack()[areas].plot(figsize=(30,15), grid=True, title='AVG reported cost per renovation application by community area');
ok, let's take the average of all predicted costs for each community and, then, calculate the percentage deviation from the average for each quarter
reported_cost_renov
ISSUE_DATE ISSUE_DATE COMMUNITY_AREA
2008 1 1.0 281000.0
2.0 607600.0
3.0 982000.0
4.0 315250.0
5.0 1026460.0
...
2023 1 73.0 1173000.0
74.0 857270.0
75.0 1398685.0
76.0 40112589.0
77.0 6861061.0
Name: REPORTED_COST, Length: 4649, dtype: float64
reported_cost_renov2=reported_cost_renov.unstack()
reported_cost_renov2.head()
| COMMUNITY_AREA | 0.0 | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | 9.0 | ... | 68.0 | 69.0 | 70.0 | 71.0 | 72.0 | 73.0 | 74.0 | 75.0 | 76.0 | 77.0 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ISSUE_DATE | ISSUE_DATE | |||||||||||||||||||||
| 2008 | 1 | NaN | 281000.0 | 607600.0 | 982000.0 | 315250.0 | 1026460.0 | 3153300.0 | 3638001.0 | 38789306.0 | 230000.0 | ... | 186300.0 | 54000.0 | 132000.0 | 254788.5 | 189000.0 | 55694.0 | 120000.00 | 425000.0 | 5265000.0 | 659470.0 |
| 2 | NaN | 1748300.0 | 2962600.0 | 8330545.0 | 3889175.0 | 15183874.0 | 8453456.0 | 6514764.0 | 73355910.0 | 903000.0 | ... | 872432.5 | 608500.0 | 1003400.0 | 663161.0 | 2987500.0 | 188120.0 | 2290853.12 | 1719156.0 | 5292701.0 | 5368979.0 | |
| 3 | NaN | 1177480.0 | 2347900.0 | 5618026.0 | 4004101.0 | 7515100.0 | 14441690.0 | 9078501.0 | 71911931.0 | 732900.0 | ... | 2574300.0 | 6033376.0 | 611041.0 | 1142500.0 | 773500.0 | 4315040.0 | 1548840.00 | 922000.0 | 6785600.0 | 5119980.0 | |
| 4 | NaN | 4763102.0 | 1495120.0 | 6630023.0 | 6984001.0 | 2602700.0 | 15027153.0 | 9567755.0 | 261083044.0 | 165800.0 | ... | 722000.0 | 4199138.0 | 898800.0 | 3203069.0 | 891600.0 | 1317155.0 | 742000.00 | 2571375.0 | 4109706.0 | 11691700.0 | |
| 2009 | 1 | NaN | 3077900.0 | 1094800.0 | 8337900.0 | 2045583.0 | 5945896.0 | 12917878.0 | 8293880.0 | 77050278.0 | 715000.0 | ... | 3320367.0 | 1027160.0 | 1831000.0 | 228800.0 | 41500.0 | 291800.0 | 2425000.00 | 940125.0 | 2720620.0 | 5403770.0 |
5 rows × 78 columns
reported_cost_renov2.index[:8]
MultiIndex([(2008, 1),
(2008, 2),
(2008, 3),
(2008, 4),
(2009, 1),
(2009, 2),
(2009, 3),
(2009, 4)],
names=['ISSUE_DATE', 'ISSUE_DATE'])
years=[]
quarters=[]
for i in reported_cost_renov2.index:
years.append(i[0])
quarters.append(i[1])
reported_cost_renov3=reported_cost_renov2.reset_index(drop=True)
reported_cost_renov3.drop(columns=0.0,inplace=True)
reported_cost_renov3.insert(0,'YEAR',years)
reported_cost_renov3.insert(1,'QUARTER',quarters)
reported_cost_renov3.head()
| COMMUNITY_AREA | YEAR | QUARTER | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | ... | 68.0 | 69.0 | 70.0 | 71.0 | 72.0 | 73.0 | 74.0 | 75.0 | 76.0 | 77.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 281000.0 | 607600.0 | 982000.0 | 315250.0 | 1026460.0 | 3153300.0 | 3638001.0 | 38789306.0 | ... | 186300.0 | 54000.0 | 132000.0 | 254788.5 | 189000.0 | 55694.0 | 120000.00 | 425000.0 | 5265000.0 | 659470.0 |
| 1 | 2008 | 2 | 1748300.0 | 2962600.0 | 8330545.0 | 3889175.0 | 15183874.0 | 8453456.0 | 6514764.0 | 73355910.0 | ... | 872432.5 | 608500.0 | 1003400.0 | 663161.0 | 2987500.0 | 188120.0 | 2290853.12 | 1719156.0 | 5292701.0 | 5368979.0 |
| 2 | 2008 | 3 | 1177480.0 | 2347900.0 | 5618026.0 | 4004101.0 | 7515100.0 | 14441690.0 | 9078501.0 | 71911931.0 | ... | 2574300.0 | 6033376.0 | 611041.0 | 1142500.0 | 773500.0 | 4315040.0 | 1548840.00 | 922000.0 | 6785600.0 | 5119980.0 |
| 3 | 2008 | 4 | 4763102.0 | 1495120.0 | 6630023.0 | 6984001.0 | 2602700.0 | 15027153.0 | 9567755.0 | 261083044.0 | ... | 722000.0 | 4199138.0 | 898800.0 | 3203069.0 | 891600.0 | 1317155.0 | 742000.00 | 2571375.0 | 4109706.0 | 11691700.0 |
| 4 | 2009 | 1 | 3077900.0 | 1094800.0 | 8337900.0 | 2045583.0 | 5945896.0 | 12917878.0 | 8293880.0 | 77050278.0 | ... | 3320367.0 | 1027160.0 | 1831000.0 | 228800.0 | 41500.0 | 291800.0 | 2425000.00 | 940125.0 | 2720620.0 | 5403770.0 |
5 rows × 79 columns
avg_costs=reported_cost_renov3.iloc[:,2:].mean()
avg_costs
COMMUNITY_AREA
1.0 6.703444e+06
2.0 4.263048e+06
3.0 1.242579e+07
4.0 6.658883e+06
5.0 7.783004e+06
...
73.0 1.639715e+06
74.0 2.094342e+06
75.0 2.545023e+06
76.0 1.297205e+08
77.0 7.733792e+06
Length: 77, dtype: float64
reported_cost_renov4=reported_cost_renov3.copy()
for col in reported_cost_renov3.columns[2:]:
reported_cost_renov4[col]=reported_cost_renov3[col]/avg_costs[col]
reported_cost_renov4.tail(1)
| COMMUNITY_AREA | YEAR | QUARTER | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | ... | 68.0 | 69.0 | 70.0 | 71.0 | 72.0 | 73.0 | 74.0 | 75.0 | 76.0 | 77.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 60 | 2023 | 1 | 0.246773 | 0.499056 | 1.565268 | 0.39376 | 0.480358 | 0.361615 | 0.529522 | 0.249376 | ... | 0.121149 | 6.474478 | 0.17894 | 0.450188 | 0.390287 | 0.715368 | 0.409327 | 0.549577 | 0.309223 | 0.887154 |
1 rows × 79 columns
We have visualized the house prices per community area - but is it an actually meaningful variable? In other words, is there a stistically significant difference between how the prices were changing for each community?
prices_df_wip8.head()
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 199.90 | 186.57 | 186.57 | 169.84 | 169.84 | 202.53 | 202.53 | 189.77 | ... | 242.33 | 242.33 | 239.50 | 239.50 | 186.94 | 186.94 | 187.41 | 187.41 | 187.56 | 187.56 |
| 1 | 2008 | 2 | 194.15 | 185.11 | 185.11 | 173.84 | 173.84 | 199.33 | 199.33 | 185.57 | ... | 231.29 | 231.29 | 233.37 | 233.37 | 179.23 | 179.23 | 183.09 | 183.09 | 182.65 | 182.65 |
| 2 | 2008 | 3 | 185.49 | 173.78 | 173.78 | 169.24 | 169.24 | 192.16 | 192.16 | 178.29 | ... | 213.16 | 213.16 | 216.28 | 216.28 | 172.93 | 172.93 | 176.09 | 176.09 | 171.06 | 171.06 |
| 3 | 2008 | 4 | 177.70 | 177.28 | 177.28 | 165.41 | 165.41 | 188.11 | 188.11 | 173.33 | ... | 202.28 | 202.28 | 198.29 | 198.29 | 165.64 | 165.64 | 169.20 | 169.20 | 164.49 | 164.49 |
| 4 | 2009 | 1 | 168.42 | 172.92 | 172.92 | 165.58 | 165.58 | 183.28 | 183.28 | 168.44 | ... | 175.62 | 175.62 | 182.58 | 182.58 | 157.68 | 157.68 | 158.34 | 158.34 | 157.86 | 157.86 |
5 rows × 35 columns
city_change=(prices_df_wip8['CHICAGO'].shift(1)-prices_df_wip8['CHICAGO'])
#price_change=(prices_df_wip8.shift(1)-prices_df_wip8)[2:]
prices_df_wip8.head()
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 199.90 | 186.57 | 186.57 | 169.84 | 169.84 | 202.53 | 202.53 | 189.77 | ... | 242.33 | 242.33 | 239.50 | 239.50 | 186.94 | 186.94 | 187.41 | 187.41 | 187.56 | 187.56 |
| 1 | 2008 | 2 | 194.15 | 185.11 | 185.11 | 173.84 | 173.84 | 199.33 | 199.33 | 185.57 | ... | 231.29 | 231.29 | 233.37 | 233.37 | 179.23 | 179.23 | 183.09 | 183.09 | 182.65 | 182.65 |
| 2 | 2008 | 3 | 185.49 | 173.78 | 173.78 | 169.24 | 169.24 | 192.16 | 192.16 | 178.29 | ... | 213.16 | 213.16 | 216.28 | 216.28 | 172.93 | 172.93 | 176.09 | 176.09 | 171.06 | 171.06 |
| 3 | 2008 | 4 | 177.70 | 177.28 | 177.28 | 165.41 | 165.41 | 188.11 | 188.11 | 173.33 | ... | 202.28 | 202.28 | 198.29 | 198.29 | 165.64 | 165.64 | 169.20 | 169.20 | 164.49 | 164.49 |
| 4 | 2009 | 1 | 168.42 | 172.92 | 172.92 | 165.58 | 165.58 | 183.28 | 183.28 | 168.44 | ... | 175.62 | 175.62 | 182.58 | 182.58 | 157.68 | 157.68 | 158.34 | 158.34 | 157.86 | 157.86 |
5 rows × 35 columns
pr_change_df=prices_df_wip8.iloc[:,0:2]
pr_change_df.head()
| YEAR | QUARTER | |
|---|---|---|
| 0 | 2008 | 1 |
| 1 | 2008 | 2 |
| 2 | 2008 | 3 |
| 3 | 2008 | 4 |
| 4 | 2009 | 1 |
#need to divide each field by city_change to create a new column in the new df pr_change_df
for col in prices_df_wip8.columns[2:]:
pr_change_df[col]=(prices_df_wip8[col].shift(1)-prices_df_wip8[col])/city_change
pr_change_df2=pr_change_df.drop(index=0).reset_index().drop(columns='index')
pr_change_df2.tail()
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 52 | 2021 | 2 | 1.0 | 0.073409 | 0.073409 | 1.615008 | 1.615008 | 1.504078 | 1.504078 | 1.336052 | ... | 1.081566 | 1.081566 | 1.019576 | 1.019576 | 1.181077 | 1.181077 | 1.122349 | 1.122349 | 0.603589 | 0.603589 |
| 53 | 2021 | 3 | 1.0 | 0.431818 | 0.431818 | 0.708556 | 0.708556 | 0.707219 | 0.707219 | 0.754011 | ... | 1.516043 | 1.516043 | 1.609626 | 1.609626 | 1.036096 | 1.036096 | 1.508021 | 1.508021 | 1.402406 | 1.402406 |
| 54 | 2021 | 4 | 1.0 | 0.305825 | 0.305825 | 0.359223 | 0.359223 | 0.517799 | 0.517799 | 1.079288 | ... | 1.407767 | 1.407767 | 1.389968 | 1.389968 | 1.213592 | 1.213592 | 1.064725 | 1.064725 | 1.487055 | 1.487055 |
| 55 | 2022 | 1 | 1.0 | 0.293814 | 0.293814 | 0.146907 | 0.146907 | 0.346649 | 0.346649 | 1.054124 | ... | 2.233247 | 2.233247 | 0.971649 | 0.971649 | 1.034794 | 1.034794 | 1.369845 | 1.369845 | 1.278351 | 1.278351 |
| 56 | 2022 | 2 | 1.0 | 1.804494 | 1.804494 | 0.434831 | 0.434831 | 0.905618 | 0.905618 | 0.957303 | ... | 1.422472 | 1.422472 | 1.229213 | 1.229213 | 0.924719 | 0.924719 | 1.451685 | 1.451685 | 0.892135 | 0.892135 |
5 rows × 35 columns
reported_cost_renov4.tail(1)
| COMMUNITY_AREA | YEAR | QUARTER | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | ... | 68.0 | 69.0 | 70.0 | 71.0 | 72.0 | 73.0 | 74.0 | 75.0 | 76.0 | 77.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 60 | 2023 | 1 | 0.246773 | 0.499056 | 1.565268 | 0.39376 | 0.480358 | 0.361615 | 0.529522 | 0.249376 | ... | 0.121149 | 6.474478 | 0.17894 | 0.450188 | 0.390287 | 0.715368 | 0.409327 | 0.549577 | 0.309223 | 0.887154 |
1 rows × 79 columns
pr_change_df2.iloc[:,2:].plot(figsize=(20,10),grid=True)
<AxesSubplot:>
Prediction 1: let's suppose increased applications cause accelerate price change in one year
#Ok these are our price changes (target variable) - the last quarter that we have data for is quarter 2 2022
#Hence for our renovation costs we should only have data up until quarter 2 2021
pr_change_df2.head(1)
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 2 | 1.0 | 0.253913 | 0.253913 | -0.695652 | -0.695652 | 0.556522 | 0.556522 | 0.730435 | ... | 1.92 | 1.92 | 1.066087 | 1.066087 | 1.34087 | 1.34087 | 0.751304 | 0.751304 | 0.853913 | 0.853913 |
1 rows × 35 columns
pr_change_df2[]
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2008.0 | 2.0 | 1.0 | 0.253913 | 0.253913 | -0.695652 | -0.695652 | 0.556522 | 0.556522 | 0.730435 | ... | 1.92 | 1.92 | 1.066087 | 1.066087 | 1.34087 | 1.34087 | 0.751304 | 0.751304 | 0.853913 | 0.853913 |
5 rows × 35 columns
pr_change_df2.tail(1)
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | 2022 | 2 | 1.0 | 1.804494 | 1.804494 | 0.434831 | 0.434831 | 0.905618 | 0.905618 | 0.957303 | ... | 1.422472 | 1.422472 | 1.229213 | 1.229213 | 0.924719 | 0.924719 | 1.451685 | 1.451685 | 0.892135 | 0.892135 |
1 rows × 35 columns
reported_cost_renov4.head(1)
| COMMUNITY_AREA | YEAR | QUARTER | 1.0 | 2.0 | 3.0 | 4.0 | 5.0 | 6.0 | 7.0 | 8.0 | ... | 68.0 | 69.0 | 70.0 | 71.0 | 72.0 | 73.0 | 74.0 | 75.0 | 76.0 | 77.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2008 | 1 | 0.041919 | 0.142527 | 0.079029 | 0.047343 | 0.131885 | 0.135761 | 0.147981 | 0.150038 | ... | 0.030708 | 0.017003 | 0.076145 | 0.094289 | 0.110724 | 0.033966 | 0.057297 | 0.166993 | 0.040587 | 0.085271 |
1 rows × 79 columns
renov_final_t=reported_cost_renov4.iloc[0:-7]
renov_final=renov_final_t[areas]
renov_final.head()
| COMMUNITY_AREA | 3.0 | 1.0 | 6.0 | 7.0 | 4.0 | 5.0 | 16.0 | 14.0 | 15.0 | 11.0 | ... | 68.0 | 69.0 | 38.0 | 41.0 | 72.0 | 75.0 | 71.0 | 44.0 | 46.0 | 53.0 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.079029 | 0.041919 | 0.135761 | 0.147981 | 0.047343 | 0.131885 | 0.060267 | 0.116731 | 0.210126 | 0.049650 | ... | 0.030708 | 0.017003 | 0.028022 | 0.120351 | 0.110724 | 0.166993 | 0.094289 | 0.159209 | 0.069464 | 0.046393 |
| 1 | 0.670424 | 0.260806 | 0.363951 | 0.264998 | 0.584058 | 1.950902 | 0.814462 | 1.162274 | 1.438010 | 1.227525 | ... | 0.143802 | 0.191602 | 1.868217 | 0.582116 | 1.750199 | 0.675497 | 0.245415 | 0.377882 | 0.365320 | 0.548055 |
| 2 | 0.452126 | 0.175653 | 0.621766 | 0.369282 | 0.601317 | 0.965578 | 0.258266 | 0.458249 | 0.345333 | 0.563529 | ... | 0.424320 | 1.899765 | 0.403069 | 0.520168 | 0.453148 | 0.362276 | 0.422803 | 0.108622 | 0.275831 | 0.275310 |
| 3 | 0.533570 | 0.710546 | 0.646972 | 0.389183 | 1.048825 | 0.334408 | 0.202372 | 0.311119 | 0.267434 | 0.847954 | ... | 0.119007 | 1.322207 | 0.422614 | 2.471277 | 0.522336 | 1.010354 | 1.185354 | 0.655286 | 1.287131 | 1.743544 |
| 4 | 0.671016 | 0.459152 | 0.556160 | 0.337366 | 0.307196 | 0.763959 | 0.234350 | 0.251509 | 0.490572 | 0.144482 | ... | 0.547294 | 0.323428 | 0.286429 | 0.516118 | 0.024312 | 0.369397 | 0.084672 | 0.093325 | 0.269063 | 2.288765 |
5 rows × 32 columns
price_final=pr_change_df2.iloc[3:,3:].reset_index(drop=True)
price_final.shape
(54, 32)
#these are all areas
#reported_cost_renov4.iloc[0:-3,2:].plot(figsize=(30,15),grid=True);
renov_costs_change=reported_cost_renov4.iloc[0:-3,2:].loc[:,areas]
renov_costs_change.plot(figsize=(30,15), grid=True)
<AxesSubplot:>
price_change=pr_change_df2.iloc[:,3:]
pr_change_df2.tail(1)
| YEAR | QUARTER | CHICAGO | 3 | 1 | 6 | 7 | 4 | 5 | 16 | ... | 68 | 69 | 38 | 41 | 72 | 75 | 71 | 44 | 46 | 53 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | 2022 | 2 | 1.0 | 1.804494 | 1.804494 | 0.434831 | 0.434831 | 0.905618 | 0.905618 | 0.957303 | ... | 1.422472 | 1.422472 | 1.229213 | 1.229213 | 0.924719 | 0.924719 | 1.451685 | 1.451685 | 0.892135 | 0.892135 |
1 rows × 35 columns
price_change.plot(figsize=(20,10),grid=True)
<AxesSubplot:>
#renov_costs_change vs price_change
renov_costs_change.shape
(58, 32)
price_change.shape
(57, 32)
#price_final vs renov_final
import seaborn as sns
sns.scatterplot(x=renov_list,y=price_list)
<AxesSubplot:>
test=pd.DataFrame({'x':renov_list,'y':price_list})
test.corr()
| x | y | |
|---|---|---|
| x | 1.000000 | 0.013957 |
| y | 0.013957 | 1.000000 |
import statsmodels.api as sm
from scipy.stats import norm
from scipy import stats
stats.pearsonr(renov_list,price_list)
PearsonRResult(statistic=0.01395689006815116, pvalue=0.5620593208882761)
renov_final.shape
(54, 32)
price_final.shape
(5, 32)
renov_list=[]
for col in renov_final.columns:
for item in renov_final[col]:
renov_list.append(item)
len(renov_list)
1728
price_list=[]
for col in price_final.columns:
for item in price_final[col]:
price_list.append(item)
len(price_list)
1728